/*************************************************************
This do file uses the Business Structure Database (BSD) that can be accessed by authorised users via the UK Data Service Secure Lab.
Using this data, the do file constructs firm birth/age measures for identifying old/new industries.

This file focuses on manufacturing industries.
Industries are analysed at the 3-digit level (SIC1992 or SIC2003 3-digit code).

**************************************************************/

clear all
set mem 2g
set matsize 11000
set maxvar 32000
set more off

global home "Y:\53523_Sorting"
global working "$home\Working"
global BSD_data "$working\BSD_Coaggl"
global logs "$working\Logs\Coaggl"
global temp "$working\Coaggl"
global geodata "$working\Data\GeoData"
global output "$working\Data\Coaggl"

*-----------------------------------------------------------
*Data cleaning part: All plants

cap log close

local i=1997
while `i'<=2009 {

	use "$BSD_data\idbr_intx4_`i'.dta", clear


	disp"*******************************"
	disp"*******************************"
	disp"*******This is year `i'*******"
	disp"*******************************"
	disp"*******************************"
	count
	disp"keeping plants with no anomalous IDs over time"
	tab hrt_id,m
	keep if hrt_id==1
	disp"keeping if adj emp is positive & not missing"
	keep if lu_empli>0 & lu_empli<.
	disp"keeping if sic3d is not missing"
	keep if sic3d>0 & sic3d<.
	disp"keeping if postcode is not missing"
	keep if postcode!="."

	keep if dlink_ref<.

	duplicates tag dlink_ref if dlink_ref!=., gen(xx)
	disp"tab duplicates"
	tab xx

	egen nplant = count(dlink_ref) if dlink_ref!=., by(entref)
	gen multi=1 if nplant>1 & nplant<.
	replace multi=0 if multi!=1 
	disp"num of multiples by entref"
	tab multi, m
	gen single=1==nplant
	disp"num of sigle firms"
	disp "This year is `i'"
	tab single, m
	*keep if single==1 /*if this commant is activated the calculations only include single plants (as in one robustness check)/*
	compress
	drop  xx merge1 
	capture drop _merge
	compress
	save "$temp\panel_`i'_all", replace
	
local i = `i'+1
}

local i=1997
while `i'<=2009 {
use "$temp\panel_`i'_all", clear
keep dlink_ref year entref
save  "$temp\panel_`i'_all_small", replace
local i = `i'+1
}

use "$temp\panel_1997_all_small", clear
local i=1998
while `i'<=2009 {
append using "$temp\panel_`i'_all_small"
local i = `i'+1
}

compress
sort dlink_ref year 
save "$temp\panel_all_adjacent_years.dta", replace

****************************
*This code opens the panel, keeps only useful variables and performs checks for the presence of plants in non-consecutive years 
*(not shown; not found given correction keep if hrt_id==1 above)
local i=1997
while `i'<=2009 {
use "$temp\panel_`i'_all", clear
count
sort dlink_ref year
merge dlink_ref year using "$temp\panel_all_adjacent_years.dta", nokeep
tab _merge
keep if _merge==3
drop _merge 
keep dlink_ref year sic3d postcode lu_empli entref birth
local z "year sic3d postcode lu_empli "
foreach j of local z {
rename `j' `j'_`i'
}
sort dlink_ref
save  "$temp\panel_`i'_all_clean_xtr", replace
local i = `i'+1
}

*--------------------------------------------------------------------------------------
*--------------------------------------------------------------------------------------
/*Measuring birth for identifying mature/less mature industries*/

capture log close
log using $logs\birth_date.log, replace


local m=1997
while `m'<=2008 {



disp "****************************"
disp "****************************"
disp "****This is for year `m'****"
disp "****************************"
disp "****************************"

use "$temp\panel_`m'_all_clean_xtr", clear

sort postcode_`m'
rename postcode_`m' postcode
replace postcode=subinstr(postcode," ","",.)

/*Merges information to map postcode to TTWAs - this kind of mapping can be easily found on-line using the Postcode Survey Directory which gets updated every years */
merge m:1 postcode using "$geodata\ttwa_pcode_mapping.dta", keep(match master)
tab _merge
drop if _merge==2
drop if _merge==1
drop _merge

************************
rename ttwa TTWA
sort TTWA
*codebook TTWA
merge m:1 TTWA using "$geodata\ruralgroup_urban_ttwa_updated.dta"

*Use the classification provided by Gibbons, Overman, Resende, SERC DP 65, 2011 to create "grouped" TTWA and rural/urban divide
tab _merge
drop if _merge==2
drop if _merge==1 
drop _merge

gen new_area_name=ttwa_name if  area_name=="Urban"
replace new_area_name=area_name if  area_name!="Urban"

gen urban=1 if  area_name=="Urban" 
replace urban=0 if  area_code!=. &  area_name!="Urban"
tab urban, m

count
rename TTWA TTWA_old
gen TTWA=area_code

drop ttwa_name area_name area_code new_area_name TTWA_old

*********************************

#delimit ;
label define ttwa_code_new
11	"Barnsley"  16	"Bedford"  20	"Birmingham"  22	"Blackburn"  23	"Blackpool"  24	"Bolton"  26	"Bournemouth"  27	"Bradford"
33	"Brighton"  34	"Bristol"  36	"Burnley"  40	"Calderdale"  41	"Cambridge"  44	"Cardiff"  48	"Chelmsford"
49	"Cheltenham"  54	"Colchester"  56	"Coventry"  59	"Crawley"  62	"Darlington"  63	"Derby"  66	"Doncaster"
70	"Dudley "  82	"Exeter"  90	"Gloucester"  94	"Grimsby"  95	"Guildford"  98	"Hartlepool" 
99	"Hastings"  107	"Huddersfield"  108	"Hull"  112	"Ipswich"  126	"Leeds"  127	"Leicester"   129	"Liverpool" 
139	"Luton"  141	"N Kent"  143	"Manchester"  144	"Mansfield"  148	"Middlesbrough"  150	"Milton Key."
157	"Newcastle"  158	"Newport"  163	"Northampton"  164	"Norwich"  165	"Nottingham"  171	"Oxford"
177	"Peterborough"  180	"Plymouth"  181	"Poole"  183	"Portsmouth"  184	"Preston"  186	"Reading"  189	"Rochdale"
195	"Sheffield"  201	"Southampton"  202	"Southend"  206	"Stevenage"  208	"Stoke-on-Trent"  211	"Sunderland" 
212	"Swansea"  213	"Swindon"  215	"Telford"  219 "Torbay"  222	"Tunbridge Wells"  225	"Wakefield"  226	"Walsall"
227	"Warrington"  233	"Wirral"  235	"Wolverhampton"  236	"Worcester"  239	"Worthing"
241	"Slough"  243	"York"  1015	"Carlisle"  1016	"Scot. Borders"  1017	"Ashington" 
1018	"Mid NE" 1019	"N Cumbria"  1020	"S Cumbria"  1021	"Kendal"  1022	"Lancaster"
1023	"NE Yorkshire"  1024	"Harrogate"  1025	"NW Yorkshire"  1026	"Scarborough" 
1027	"Chester"  1028	"Northwich"  1029	"Wrexham"  1030	"Shrewsbury"  1031	"Mid-Wales Bord."
1032	"Hereford"  1033	"South-Wales Bord."  1034	"Bridgend"  1035	"South-Mid Wales"  1036	"SW Wales"
1037	"Mid Wales"  1039	"NW Wales"  1040	"North Wales"  1041	"Stafford"  1042	"Burton"  1043	"W Peak Distr."
1044	"Chesterfield"   1045	"Worksop & Retford"  1046	"Scunthorpe"  1047 "E. Lincolnshire"   1048	"W Lincolnshire"  1049	"Norfolk"
1050	"N Norfolk"  1051	"Huntingdon"  1052	"Kettering"  1053	"Warwick"  1054	"Rugby"
1055	"Banbury"   1056	"W East Anglia"  1057	"East Anglia Coast" 
1058	"Harlow"  1059	"Chichester"  1060	"Eastbourne"  1061	"W Kent" 
1062	"Canterbury"  1063	"E Kent"  1064	"I. of Wight"  1065	"Basingstoke"  1066	"Newbury"
1067	"Andover"  1068	"Salisbury"  1069	"Trowbridge"  1070	"Bath"  1071	"E Somerset"
1072	"Yeovil"  1073	"Devon Coast"  1074	"S Devon"  1075	"Taunton"  1076	"N Devon"  1077	"NW Devon" 
1078	"W Cornwall"  1079	"E Cornwall"  
1 "Aberdeen" 73 "Dundee" 79 "Edinburgh" 89 "Glasgow" 123 "Lanarkshire" 1001 "N Scotland" 1002 "Moray Firth" 1003 "W Highlands"	
1004 "Inverness" 1005 "Stirling&All." 1006 "E Highlands"	1007 "Perth&Blairg." 1008 "N of Forth"
1009 "Dunfermline" 1010 "Falkirk"	1011 "Livingston&Bathg." 1012 "Greenock,Arran&Irv." 1013 "Ayr&Kilmarnock" 1014 "N Solway"
135  "London" ;

#delimit cr;
label values TTWA ttwa_code_new
*tab TTWA, m
*tab TTWA, nol

/*Adjusting for problematic postcodes - see Web Appendix of the Manuscript for details*/
sort postcode sic3d_`m'
by postcode sic3d_`m': egen c_pc_sc=count(dlink_ref)
egen pc95=pctile(c_pc_sc), p(95)
drop if c_pc_sc>pc95

sort sic3d_`m'
by sic3d_`m': egen pe99=pctile(lu_empli_`m'), p(99)
drop if lu_empli_`m'>pe99


drop pe99 pc95 c_pc_sc
 
sort postcode sic3d_`m' entref
by postcode sic3d_`m' entref: egen c_pc_sc=count(dlink_ref)
sum c_pc_sc, det
drop if c_pc_sc>2

rename  year_`m' year 
rename  sic3d_`m'  sic3d
rename  lu_empli_`m' empl

*****************************************
**********Manufacturing only*************
*****************************************
count
keep if sic3d>=151 & sic3d<=372
count

*---- Drop tobacco and aggregates some sectors; see manuscript for details
drop if sic3d==160
recode sic3d 183 182 181 = 182
recode sic3d 231 232 233 = 232
recode sic3d 247 246 = 246
recode sic3d 154 = 158
recode sic3d 265 = 266
recode sic3d 223 = 222

egen ind = group(sic3d)
disp"sum ind for year `m"
sum ind
gen min=r(min)
gen max=r(max)

*---- keep if urban
keep if urban==1

collapse (median) b_md=birth (max) b_M=birth (min) b_m=birth, by(ind sic3d)

gen year = `m'
sort sic3d
save "$temp\birth_`m'.dta", replace
local m =`m'+1
}

use "$temp\birth_1997.dta", clear
local m=1998
while `m'<=2008 {
append using "$temp\birth_`m'.dta"
local m = `m'+1
}
sort year sic3d
label var b_md"median plant birth date"
label var b_M"max plant birth date"
label var b_m"min plant birth date"

gen sic_i=string(sic3d)
gen sic_j=string(sic3d)
save "$output\birth_date_1997-2008.dta", replace

log close

use "$output\birth_date_1997-2008.dta", clear
rename b_md b_mdi 
rename b_M b_Mi
rename b_m b_mi
keep sic_i year b_mdi b_Mi b_mi
sort sic_i year
save "$output\birth_date_i.dta", replace

use "$output\birth_date_1997-2008.dta", clear
rename b_md b_mdj 
rename b_M b_Mj
rename b_m b_mj
keep sic_j year b_mdj b_Mj b_mj
sort sic_j year
save "$output\birth_date_j.dta", replace
